☑ A quick guide to Data Cleaning
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
# Load the two source tables and place them side by side; join="inner"
# keeps only the index labels that are present in both frames.
specifications = pd.read_csv("./datasets/specifications.csv")
annual_sales = pd.read_csv("./datasets/annual_sales.csv")
df = pd.concat(
    objs=[specifications, annual_sales],
    join="inner",
    axis="columns",
)
# Columns that are reduced to their digits and then converted to numbers.
numeric_text_columns = [
    'Minimum_Turning_Radius', 'Ex-Showroom_Price', 'Displacement',
    'Fuel_Tank_Capacity', 'Height', 'Width', 'Length', 'Front_Track',
    'Rear_Track', 'City_Mileage', 'Highway_Mileage',
    'ARAI_Certified_Mileage', 'ARAI_Certified_Mileage_for_CNG',
    'Kerb_Weight', 'Ground_Clearance', 'Wheelbase', 'Boot_Space',
]
for column_name in numeric_text_columns:
    # regex=True must be explicit: newer pandas treats the pattern as a
    # literal string by default, which would leave the text untouched and
    # make pd.to_numeric fail on the first non-numeric value.
    # NOTE(review): \D also removes decimal points and minus signs, so a
    # value such as "4.7" becomes 47 -- confirm this is intended.
    digits_only = df[column_name].str.replace(r'\D', '', regex=True)
    df[column_name] = pd.to_numeric(digits_only)
# Replace every textual warranty entry with the integer given by its first
# character; non-string entries (e.g. missing values) are left as they are.
# A single column assignment replaces the element-wise chained assignment
# (df.Basic_Warranty[i] = ...), which pandas does not guarantee to write
# back into `df`.
# NOTE(review): only the first character is read, so a two-digit leading
# number would be truncated -- confirm against the data.
# NOTE(review): the column is left for pandas to type, so ints plus missing
# values give a numeric column that the later object-dtype label-encoding
# loop no longer touches.
df["Basic_Warranty"] = df["Basic_Warranty"].map(
    lambda value: int(value[0]) if isinstance(value, str) else value
)
df.tail(3)
| Make | Model | Variant | Ex-Showroom_Price | Displacement | Cylinders | Valves_Per_Cylinder | Drivetrain | Cylinder_Configuration | Emission_Norm | ... | 2010 | 2009 | 2008 | 2007 | 2006 | 2005 | 2004 | 2003 | 2002 | 2001 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 770 | Ford | Endeavour | 2.2L 4X2 Mt Titanium | 2920000 | 2198.0 | 4.0 | 4.0 | RWD (Rear Wheel Drive) | In-line | BS IV | ... | 1447 | 1067 | 1059 | 1423 | 1347 | 1524 | 963 | 626 | 119 | 0 |
| 771 | Mini | Cooper 3 Door | Cooper D | 2990000 | 1496.0 | 3.0 | 4.0 | FWD (Front Wheel Drive) | In-line | BS IV | ... | 6392 | 3633 | 1879 | 3232 | 3547 | 4614 | 3901 | 1507 | 0 | 0 |
| 772 | Mini | Cooper 3 Door | Cooper S | 3420000 | 1998.0 | 4.0 | 4.0 | FWD (Front Wheel Drive) | In-line | BS IV | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 rows × 160 columns
# Remove the columns that are not used in the rest of the analysis.
# The drop is done in place on `df`.
df.drop(
    columns=[
        "Drivetrain",
        "Cylinder_Configuration",
        "Emission_Norm",
        "Engine_Location",
        "Fuel_System",
        "ARAI_Certified_Mileage",
        "ARAI_Certified_Mileage_for_CNG",
        "Ground_Clearance",
        "Front_Brakes",
        "Rear_Brakes",
        "Front_Suspension",
        "Rear_Suspension",
        "Front_Track",
        "Rear_Track",
        "Front_Tyre_&_Rim",
        "Rear_Tyre_&_Rim",
        "Power_Steering",
        "Power_Windows",
        "Power_Seats",
        "Keyless_Entry",
        "Torque",
        "Odometer",
        "Tachometer",
        "Tripmeter",
        "Start_/_Stop_Button",
        "12v_Power_Outlet",
        "Aux-in_Compatibility",
        "Average_Fuel_Consumption",
        "Boot-lid_Opener",
        "Boot_Space",
        "Central_Locking",
        "Child_Safety_Locks",
        "Clock",
        "Cup_Holders",
        "Distance_to_Empty",
        "Door_Pockets",
        "Engine_Malfunction_Light",
        "Extended_Warranty",
        "FM_Radio",
        "Fuel-lid_Opener",
        "Fuel_Gauge",
        "Handbrake",
        "Instrument_Console",
        "Low_Fuel_Warning",
        "Minimum_Turning_Radius",
        "Multifunction_Display",
        "Sun_Visor",
        "Third_Row_AC_Vents",
        "Ventilation_System",
        "Auto-Dimming_Rear-View_Mirror",
        "Hill_Assist",
        "Gear_Indicator",
        "3_Point_Seat-Belt_in_Middle_Rear_Seat",
        "Ambient_Lightning",
        "Cargo/Boot_Lights",
        "Drive_Modes",
        "Engine_Immobilizer",
        "High_Speed_Alert_System",
        "Lane_Watch_Camera/_Side_Mirror_Camera",
        "Passenger_Side_Seat-Belt_Reminder",
        "Seat_Back_Pockets",
        "Headlight_Reminder",
        "Adjustable_Headrests",
        "Gross_Vehicle_Weight",
        "Door_Ajar_Warning",
        "EBD_(Electronic_Brake-force_Distribution)",
        "Fasten_Seat_Belt_Warning",
        "Gear_Shift_Reminder",
        "Number_of_Airbags",
        "Compression_Ratio",
        "Adjustable_Steering_Column",
        "Other_Specs",
        "Other_specs",
        "Key_Off_Reminder",
        "USB_Compatibility",
        "Android_Auto",
        "Apple_CarPlay",
        "Cigarette_Lighter",
        "Infotainment_Screen",
        "Multifunction_Steering_Wheel",
        "Average_Speed",
        "EBA_(Electronic_Brake_Assist)",
        "Seat_Height_Adjustment",
        "Navigation_System",
        "Second_Row_AC_Vents",
        "Tyre_Pressure_Monitoring_System",
        "Rear_Center_Armrest",
        "iPod_Compatibility",
        "ESP_(Electronic_Stability_Program)",
        "Cooled_Glove_Box",
        "Recommended_Tyre_Pressure",
        "Heated_Seats",
        "Turbocharger",
        "ISOFIX_(Child-Seat_Mount)",
        "Rain_Sensing_Wipers",
        "Paddle_Shifters",
        "Leather_Wrapped_Steering",
        "Automatic_Headlamps",
        "Engine_Type",
        "ASR_/_Traction_Control",
        "Cruise_Control",
        "USB_Ports",
        "Heads-Up_Display",
        "Welcome_Lights",
        "Battery",
        "Electric_Range",
    ],
    inplace=True,
)
df.columns
Index(['Make', 'Model', 'Variant', 'Ex-Showroom_Price', 'Displacement',
'Cylinders', 'Valves_Per_Cylinder', 'Fuel_Tank_Capacity', 'Fuel_Type',
'Height', 'Length', 'Width', 'Body_Type', 'Doors', 'City_Mileage',
'Highway_Mileage', 'Kerb_Weight', 'Gears', 'Power', 'Speedometer',
'Seating_Capacity', 'Seats_Material', 'Type', 'Wheelbase',
'Wheels_Size', 'Audiosystem', 'Basic_Warranty', 'Bluetooth',
'CD_/_MP3_/_DVD_Player', 'Voice_Recognition', 'Walk_Away_Auto_Car_Lock',
'ABS_(Anti-lock_Braking_System)', 'Airbags', 'Parking_Assistance',
'2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
'2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003',
'2002', '2001'],
dtype='object')
# Collapse city and highway mileage into their arithmetic mean, remove the
# two source columns, and place the combined column at position 10.
column = ((df["City_Mileage"] + df["Highway_Mileage"]) / 2).rename("Mileage")
df.drop(columns=["City_Mileage", "Highway_Mileage"], inplace=True)
df.insert(10, "Mileage", column)
df.columns
Index(['Make', 'Model', 'Variant', 'Ex-Showroom_Price', 'Displacement',
'Cylinders', 'Valves_Per_Cylinder', 'Fuel_Tank_Capacity', 'Fuel_Type',
'Height', 'Mileage', 'Length', 'Width', 'Body_Type', 'Doors',
'Kerb_Weight', 'Gears', 'Power', 'Speedometer', 'Seating_Capacity',
'Seats_Material', 'Type', 'Wheelbase', 'Wheels_Size', 'Audiosystem',
'Basic_Warranty', 'Bluetooth', 'CD_/_MP3_/_DVD_Player',
'Voice_Recognition', 'Walk_Away_Auto_Car_Lock',
'ABS_(Anti-lock_Braking_System)', 'Airbags', 'Parking_Assistance',
'2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
'2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004', '2003',
'2002', '2001'],
dtype='object')
# Turn Parking_Assistance into a 0/1 flag: 1 where the cell holds a string,
# 0 for anything else (e.g. a missing value).
# One vectorised assignment replaces the element-wise chained assignment
# (df.Parking_Assistance[i] = ...), which pandas does not guarantee to
# write back into `df`, and isinstance() replaces the type() comparison.
df["Parking_Assistance"] = df["Parking_Assistance"].map(
    lambda value: int(isinstance(value, str))
)
df.Parking_Assistance
0 0
1 0
2 0
3 0
4 0
..
768 1
769 1
770 1
771 1
772 1
Name: Parking_Assistance, Length: 773, dtype: object
# Keep the part of each Power entry before the first "@", strip everything
# that is not a digit, and store the result as a number.
# Compared with the element-wise loop this (a) avoids chained assignment,
# which pandas does not guarantee to write back into `df`, (b) leaves
# missing values as NaN instead of failing on `.split`, and (c) yields a
# numeric column rather than an object column of Python ints.
# NOTE(review): decimal points are stripped too, so "67.04" would become
# 6704 -- confirm against the data.
power_text = df["Power"].str.split("@").str[0]
df["Power"] = pd.to_numeric(power_text.str.replace(r"\D", "", regex=True))
#using oneHotEncoding
def ohe(c93):
    """Return a copy of ``c93`` with every object-dtype column one-hot encoded.

    Each object column is expanded with ``pd.get_dummies(..., drop_first=True)``;
    the dummy columns are appended on the right and the source column is
    removed. Dummy columns are named after the category values only (no
    column-name prefix). ``c93`` itself is not modified.

    Parameters
    ----------
    c93 : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded deep copy of ``c93``.
    """
    # DataFrame.copy is used instead of copy.deepcopy: in this file the
    # name `copy` is imported only further down and is then rebound to a
    # DataFrame, so relying on the module here is fragile.
    merged = c93.copy(deep=True)
    for name in c93.columns:
        if c93[name].dtype == object:
            dummy = pd.get_dummies(c93[name], drop_first=True)
            merged = pd.concat([merged, dummy], axis='columns')
            merged.drop(name, axis=1, inplace=True)
    return merged
#-------------------------------------------------------------
def le1(c93):
    """Return a copy of ``c93`` with every object-dtype column label-encoded.

    A single ``LabelEncoder`` is re-fitted on each object column in turn and
    the column is replaced by the resulting integer codes. ``c93`` itself is
    not modified.

    Parameters
    ----------
    c93 : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        Encoded deep copy of ``c93``.
    """
    lbcode = LabelEncoder()
    # DataFrame.copy is used instead of copy.deepcopy: in this file the
    # name `copy` is imported only further down and is then rebound to a
    # DataFrame, so relying on the module here is fragile.
    merged = c93.copy(deep=True)
    for name in c93.columns:
        if c93[name].dtype == object:
            merged[name] = lbcode.fit_transform(merged[name])
    return merged
#--------------------------------------------------------------
def le2(c93):
    """Print the ordinal encoding of every object-dtype column of ``c93``.

    For each object column the column name, the encoded array and a newline
    are printed. Nothing is returned and ``c93`` is not modified.
    """
    orcode = OrdinalEncoder()
    object_columns = [
        name for name in c93.columns if c93[name].dtypes == object
    ]
    for name in object_columns:
        # Double brackets keep the selection 2-D, as the encoder requires.
        c93_encode = orcode.fit_transform(c93[[name]])
        print(name, c93_encode, '\n')
#--------------------------------------------------------------
def le3(c93):
    """Print the one-hot encoding of every object-dtype column of ``c93``.

    For each object column the column name, the dense encoded array and a
    newline are printed. Nothing is returned and ``c93`` is not modified.
    """
    # The encoder is built with its defaults and the result is densified
    # with .toarray(): the `sparse=False` keyword is not accepted by newer
    # scikit-learn releases, whereas this form works on old and new ones.
    # The local is also no longer called `ohe`, which shadowed the sibling
    # function of that name.
    encoder = OneHotEncoder()
    for name in c93.columns:
        if c93[name].dtype == object:
            arr = encoder.fit_transform(c93[[name]]).toarray()
            print(name, arr, '\n')
# Label-encode every remaining object-dtype column of `df` in place.
le = LabelEncoder()
for column_name in df.columns:
    if df[column_name].dtype == object:
        # LabelEncoder expects a 1-D target, so pass the Series itself
        # rather than a one-column DataFrame (df[[column_name]]).
        df[column_name] = le.fit_transform(df[column_name])
from sklearn.impute import KNNImputer

# Fill missing values in all non-object columns with a k-nearest-neighbours
# imputer (k=5) fitted on those columns together.
# Imputing one column at a time gives the imputer no other feature to
# measure distances on, so every gap would simply receive the column mean.
# NOTE(review): distances are computed on unscaled features, so columns
# with large magnitudes dominate the neighbour search -- consider scaling
# before imputation.
imp = KNNImputer(n_neighbors=5)
numeric_columns = df.columns[df.dtypes != object]
if len(numeric_columns) > 0:
    df[numeric_columns] = imp.fit_transform(df[numeric_columns])
# Pairwise scatter plots of every column in `df` against every other one.
from pandas.plotting import scatter_matrix

scatter_matrix(frame=df, figsize=(50, 50))
plt.show()
#choose multiplier
#drop rows with outliers
"""
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
multiplier=10
df = df[~((df < (Q1 - multiplier * IQR)) |(df > (Q3 + multiplier * IQR))).any(axis=1)]
df.shape
"""
# this can also be done for a specific column instead of the entire dataframe.
# the columns can be chosen based on the above scatterplots
'\nQ1 = df.quantile(0.25)\nQ3 = df.quantile(0.75)\nIQR = Q3 - Q1\nmultiplier=10\ndf = df[~((df < (Q1 - multiplier * IQR)) |(df > (Q3 + multiplier * IQR))).any(axis=1)]\ndf.shape\n'
"""
z=stats.zscore(df.Power)
df.Power[(z < 3)]
# drop those rows...
"""
'\nz=stats.zscore(df.Power)\ndf.Power[(z < 3)]\n# drop those rows...\n'
Standard scaling, feature reduction using PCA, normalization, and data smoothing are not recommended, since they may cause loss of important information such as the price of each car and its sales.
Generally, correlation matrices should be computed after normalizing or standard-scaling the data, but that can lead to loss of important information, especially for the automotive industry. Hence, a copy of the dataset can be created, and the correlations can be computed on that scaled or normalized copy.
import copy

# Work on a deep copy so `df` keeps its raw values, then smooth every
# column with an exponentially weighted mean.
# NOTE(review): the assignment below rebinds the name `copy` from the
# stdlib module to a DataFrame; anything that later calls copy.deepcopy
# (e.g. the ohe / le1 helpers) will no longer find the module.
alpha = 0.3
copy = copy.deepcopy(df).ewm(alpha=alpha).mean()
copy
| Make | Model | Variant | Ex-Showroom_Price | Displacement | Cylinders | Valves_Per_Cylinder | Fuel_Tank_Capacity | Fuel_Type | Height | ... | 2010 | 2009 | 2008 | 2007 | 2006 | 2005 | 2004 | 2003 | 2002 | 2001 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 32.000000 | 91.000000 | 592.000000 | 2.926670e+05 | 624.000000 | 2.000000 | 2.000000 | 24.000000 | 5.000000 | 1652.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 1 | 32.000000 | 91.000000 | 579.058824 | 2.595964e+05 | 624.000000 | 2.000000 | 2.000000 | 24.000000 | 5.000000 | 1652.000000 | ... | 538.235294 | 450.588235 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | 32.000000 | 91.000000 | 428.347032 | 2.765209e+05 | 624.000000 | 2.000000 | 2.000000 | 19.890411 | 2.716895 | 1652.000000 | ... | 292.465753 | 244.840183 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 3 | 32.000000 | 91.000000 | 495.324122 | 2.995162e+05 | 624.000000 | 2.000000 | 2.000000 | 21.512831 | 3.618239 | 1652.000000 | ... | 177.003553 | 148.180024 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 4 | 32.000000 | 91.000000 | 524.777037 | 2.896741e+05 | 624.000000 | 2.000000 | 2.000000 | 22.409722 | 4.116512 | 1652.000000 | ... | 148.153691 | 156.769680 | 26.684937 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 768 | 26.372926 | 54.595731 | 223.256613 | 3.242351e+06 | 2826.326740 | 4.329442 | 3.999994 | 78.992091 | 2.781301 | 1814.006580 | ... | 954.397095 | 1091.856904 | 1169.541079 | 867.333827 | 887.630045 | 1012.000418 | 893.780017 | 737.903565 | 700.853131 | 505.631516 |
| 769 | 21.761048 | 52.617012 | 191.379629 | 3.239546e+06 | 2637.828718 | 4.230610 | 3.999996 | 79.294463 | 2.546911 | 1820.904606 | ... | 2032.477966 | 1776.199833 | 911.078755 | 607.133679 | 621.341031 | 708.400292 | 625.646012 | 516.532496 | 490.597192 | 353.942061 |
| 770 | 18.532734 | 51.231908 | 169.365740 | 3.143682e+06 | 2505.880102 | 4.161427 | 3.999997 | 79.506124 | 2.382837 | 1825.733224 | ... | 1856.834577 | 1563.439883 | 955.455129 | 851.893575 | 839.038722 | 953.080205 | 726.852208 | 549.372747 | 379.118034 | 247.759443 |
| 771 | 20.772914 | 46.062336 | 184.556018 | 3.097577e+06 | 2202.916072 | 3.812999 | 3.999998 | 68.854287 | 2.267986 | 1702.213257 | ... | 3217.384204 | 2184.307918 | 1232.518590 | 1565.925503 | 1651.427105 | 2051.356143 | 1679.096546 | 836.660923 | 265.382624 | 173.431610 |
| 772 | 22.341040 | 42.443635 | 195.489213 | 3.194304e+06 | 2141.441250 | 3.869099 | 3.999999 | 61.398001 | 3.087590 | 1615.749280 | ... | 2252.168943 | 1529.015543 | 862.763013 | 1096.147852 | 1155.998974 | 1435.949300 | 1175.367582 | 585.662646 | 185.767837 | 121.402127 |
773 rows × 53 columns
from sklearn.preprocessing import StandardScaler

# Standardise each column of the working copy independently; the scaler is
# re-fitted on every column in turn.
scale = StandardScaler()
for column_name in copy.columns:
    column_values = np.array(copy[column_name]).reshape(-1, 1)
    copy[column_name] = scale.fit_transform(column_values).ravel()
copy
| Make | Model | Variant | Ex-Showroom_Price | Displacement | Cylinders | Valves_Per_Cylinder | Fuel_Tank_Capacity | Fuel_Type | Height | ... | 2010 | 2009 | 2008 | 2007 | 2006 | 2005 | 2004 | 2003 | 2002 | 2001 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.420945 | 0.574255 | 1.647875 | -0.319187 | -1.254463 | -1.662544 | -3.874900 | -0.569832 | 1.667709 | 0.503058 | ... | -0.608217 | -0.575313 | -0.587975 | -0.582323 | -0.583807 | -0.602112 | -0.605540 | -0.573133 | -0.561638 | -0.582211 |
| 1 | 1.420945 | 0.574255 | 1.561225 | -0.322632 | -1.254463 | -1.662544 | -3.874900 | -0.569832 | 1.667709 | 0.503058 | ... | -0.456617 | -0.448079 | -0.587975 | -0.582323 | -0.583807 | -0.602112 | -0.605540 | -0.573133 | -0.561638 | -0.582211 |
| 2 | 1.420945 | 0.574255 | 0.552106 | -0.320869 | -1.254463 | -1.662544 | -3.874900 | -0.642833 | -0.837321 | 0.503058 | ... | -0.525841 | -0.506177 | -0.587975 | -0.582323 | -0.583807 | -0.602112 | -0.605540 | -0.573133 | -0.561638 | -0.582211 |
| 3 | 1.420945 | 0.574255 | 1.000564 | -0.318474 | -1.254463 | -1.662544 | -3.874900 | -0.614013 | 0.151637 | 0.503058 | ... | -0.558362 | -0.533471 | -0.587975 | -0.582323 | -0.583807 | -0.602112 | -0.605540 | -0.573133 | -0.561638 | -0.582211 |
| 4 | 1.420945 | 0.574255 | 1.197771 | -0.319499 | -1.254463 | -1.662544 | -3.874900 | -0.598081 | 0.698343 | 0.503058 | ... | -0.566488 | -0.531045 | -0.579884 | -0.582323 | -0.583807 | -0.602112 | -0.605540 | -0.573133 | -0.561638 | -0.582211 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 768 | 0.636228 | -0.512668 | -0.821116 | -0.011916 | 1.342880 | 0.125611 | 0.071935 | 0.407020 | -0.766654 | 1.977005 | ... | -0.339401 | -0.267003 | -0.233337 | -0.326361 | -0.264434 | -0.141051 | -0.092530 | -0.046686 | 0.136211 | 0.261836 |
| 769 | -0.006916 | -0.571747 | -1.034554 | -0.012208 | 1.120573 | 0.049743 | 0.071939 | 0.412392 | -1.023828 | 2.039764 | ... | -0.035748 | -0.073763 | -0.311710 | -0.403149 | -0.360246 | -0.279369 | -0.246433 | -0.204620 | -0.073144 | 0.008622 |
| 770 | -0.457117 | -0.613102 | -1.181952 | -0.022194 | 0.964957 | -0.003364 | 0.071941 | 0.416151 | -1.203850 | 2.083695 | ... | -0.085220 | -0.133841 | -0.298254 | -0.330917 | -0.281918 | -0.167895 | -0.188343 | -0.181190 | -0.184145 | -0.168628 |
| 771 | -0.144716 | -0.767450 | -1.080243 | -0.026997 | 0.607653 | -0.270828 | 0.071943 | 0.226937 | -1.329865 | 0.959902 | ... | 0.297994 | 0.041475 | -0.214240 | -0.120197 | 0.010383 | 0.332473 | 0.358224 | 0.023772 | -0.297393 | -0.292703 |
| 772 | 0.073966 | -0.875494 | -1.007037 | -0.016921 | 0.535152 | -0.227764 | 0.071945 | 0.094488 | -0.430593 | 0.173246 | ... | 0.026131 | -0.143561 | -0.326361 | -0.258835 | -0.167874 | 0.052097 | 0.069095 | -0.155300 | -0.376667 | -0.379556 |
773 rows × 53 columns
# Correlation matrix of the scaled copy: printed as text, then drawn as a
# heatmap on a large figure.
correlations = copy.corr()
print(correlations)
fig, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(correlations, cmap="YlGnBu", ax=ax)
Make Model Variant \
Make 1.000000 0.208475 0.253510
Model 0.208475 1.000000 0.085415
Variant 0.253510 0.085415 1.000000
Ex-Showroom_Price -0.115716 -0.018142 0.009160
Displacement -0.075206 0.026471 -0.074365
Cylinders -0.145499 -0.054522 -0.116865
Valves_Per_Cylinder -0.218304 -0.032458 0.023025
Fuel_Tank_Capacity -0.078310 0.038498 -0.024173
Fuel_Type -0.021992 -0.077561 0.045562
Height 0.079839 0.140346 0.042257
Mileage -0.002513 -0.028645 -0.062856
Length -0.021101 0.100128 -0.226275
Width -0.077805 0.169530 -0.136537
Body_Type -0.077230 0.098208 -0.181694
Doors 0.139971 0.004283 0.087467
Kerb_Weight -0.133579 -0.199085 -0.247041
Gears -0.013906 0.035437 -0.076801
Power -0.095095 0.032931 -0.173772
Speedometer -0.120382 0.033813 -0.099986
Seating_Capacity 0.126563 0.125840 0.041827
Seats_Material -0.076378 0.175931 -0.284230
Type 0.033284 -0.086956 0.011190
Wheelbase -0.036165 0.084354 -0.222958
Wheels_Size -0.101490 0.203160 -0.171305
Audiosystem 0.061301 -0.277678 0.284214
Basic_Warranty 0.057714 0.124746 -0.094087
Bluetooth -0.010305 -0.085201 0.152288
CD_/_MP3_/_DVD_Player -0.035144 -0.206113 0.085889
Voice_Recognition 0.199049 0.045625 0.079042
Walk_Away_Auto_Car_Lock 0.264290 0.023262 0.093925
ABS_(Anti-lock_Braking_System) -0.033435 -0.034147 0.267067
Airbags -0.031306 0.036009 0.170229
Parking_Assistance -0.105118 -0.009136 -0.145919
2020 0.015468 0.022929 0.135239
2019 0.005010 -0.025064 0.124310
2018 -0.008100 -0.039486 0.093668
2017 -0.022235 -0.041271 0.075335
2016 -0.022815 -0.060117 0.040416
2015 -0.010044 -0.066281 0.027096
2014 -0.008843 -0.060062 0.017133
2013 -0.011172 -0.064457 0.012024
2012 -0.009170 -0.057592 0.005299
2011 -0.017278 -0.054700 -0.031533
2010 -0.010799 -0.031360 -0.055540
2009 -0.004657 -0.036278 -0.024645
2008 0.007526 -0.060613 -0.058288
2007 0.002874 -0.053609 -0.059169
2006 -0.028341 -0.040278 -0.045639
2005 -0.022707 -0.047797 -0.040838
2004 -0.023089 -0.053334 -0.033640
2003 0.005454 -0.064037 -0.025720
2002 0.033423 -0.056388 -0.044466
2001 0.080530 -0.058918 -0.051113
Ex-Showroom_Price Displacement Cylinders \
Make -0.115716 -0.075206 -0.145499
Model -0.018142 0.026471 -0.054522
Variant 0.009160 -0.074365 -0.116865
Ex-Showroom_Price 1.000000 0.795188 0.838380
Displacement 0.795188 1.000000 0.902160
Cylinders 0.838380 0.902160 1.000000
Valves_Per_Cylinder 0.036746 0.141948 0.097594
Fuel_Tank_Capacity 0.338944 0.565761 0.490547
Fuel_Type 0.210892 0.004770 0.073650
Height -0.147219 0.133143 -0.072956
Mileage 0.003464 0.076727 0.036918
Length 0.384321 0.766015 0.623395
Width 0.389666 0.730940 0.596703
Body_Type 0.007320 0.179813 0.162047
Doors -0.513613 -0.543241 -0.566929
Kerb_Weight -0.043837 -0.063152 -0.030376
Gears 0.290103 0.440566 0.452555
Power 0.513142 0.820511 0.729666
Speedometer 0.100452 0.032931 0.033468
Seating_Capacity -0.311992 -0.050832 -0.262279
Seats_Material 0.384793 0.598961 0.528216
Type -0.389581 -0.567440 -0.469751
Wheelbase 0.446363 0.786926 0.651565
Wheels_Size 0.407756 0.617190 0.520861
Audiosystem 0.067797 -0.017464 0.031136
Basic_Warranty 0.194254 0.219610 0.298713
Bluetooth -0.143551 -0.323512 -0.323833
CD_/_MP3_/_DVD_Player -0.136109 -0.325334 -0.320540
Voice_Recognition -0.219807 -0.190832 -0.188798
Walk_Away_Auto_Car_Lock -0.217931 -0.159362 -0.165533
ABS_(Anti-lock_Braking_System) -0.031729 -0.184391 -0.195045
Airbags 0.108251 -0.022966 0.024250
Parking_Assistance 0.145808 0.346522 0.306858
2020 0.116692 0.151386 0.108154
2019 0.094731 0.137781 0.103406
2018 0.099199 0.138692 0.108914
2017 0.095631 0.121817 0.097039
2016 0.072035 0.108117 0.080966
2015 0.032750 0.074535 0.042942
2014 0.011265 0.065275 0.030579
2013 0.015755 0.069295 0.033503
2012 0.019075 0.076272 0.029046
2011 0.003830 0.058935 0.016894
2010 0.001761 0.065749 0.017867
2009 -0.013582 0.056858 0.002207
2008 -0.024890 0.038771 0.002837
2007 -0.006691 0.039776 0.012457
2006 0.004146 0.044899 0.021810
2005 0.022745 0.058172 0.032112
2004 0.017314 0.048318 0.027538
2003 0.020139 0.043726 0.020634
2002 0.027058 0.046905 0.023154
2001 0.043522 0.068427 0.031733
Valves_Per_Cylinder Fuel_Tank_Capacity \
Make -0.218304 -0.078310
Model -0.032458 0.038498
Variant 0.023025 -0.024173
Ex-Showroom_Price 0.036746 0.338944
Displacement 0.141948 0.565761
Cylinders 0.097594 0.490547
Valves_Per_Cylinder 1.000000 0.062792
Fuel_Tank_Capacity 0.062792 1.000000
Fuel_Type 0.071125 0.032150
Height -0.071956 0.081234
Mileage 0.002455 0.281389
Length 0.228045 0.414888
Width 0.182482 0.430216
Body_Type 0.060577 0.083175
Doors -0.110578 -0.271047
Kerb_Weight -0.009843 -0.039913
Gears 0.193782 0.350491
Power 0.259934 0.459501
Speedometer -0.024578 -0.038011
Seating_Capacity 0.005090 -0.092388
Seats_Material 0.194238 0.400613
Type -0.248846 -0.343209
Wheelbase 0.221225 0.413505
Wheels_Size 0.006568 0.372093
Audiosystem 0.025098 -0.018309
Basic_Warranty 0.092205 0.082739
Bluetooth -0.181965 -0.209915
CD_/_MP3_/_DVD_Player -0.124616 -0.209993
Voice_Recognition -0.113612 -0.018233
Walk_Away_Auto_Car_Lock -0.122964 -0.016126
ABS_(Anti-lock_Braking_System) -0.243578 -0.118617
Airbags -0.178514 0.014427
Parking_Assistance 0.266482 0.206278
2020 0.025518 0.002023
2019 0.052571 0.001751
2018 0.033120 0.001765
2017 0.020534 -0.000950
2016 0.004079 -0.013114
2015 0.007216 -0.041144
2014 0.008595 -0.047113
2013 0.008817 -0.043714
2012 0.015946 -0.037033
2011 0.013049 -0.045157
2010 -0.008751 -0.042896
2009 -0.017025 -0.042815
2008 -0.049046 -0.049399
2007 -0.061956 -0.046491
2006 -0.047748 -0.040973
2005 -0.029689 -0.037917
2004 -0.046000 -0.043629
2003 -0.047788 -0.045429
2002 -0.056333 -0.043184
2001 -0.062636 -0.039925
Fuel_Type Height ... 2010 2009 \
Make -0.021992 0.079839 ... -0.010799 -0.004657
Model -0.077561 0.140346 ... -0.031360 -0.036278
Variant 0.045562 0.042257 ... -0.055540 -0.024645
Ex-Showroom_Price 0.210892 -0.147219 ... 0.001761 -0.013582
Displacement 0.004770 0.133143 ... 0.065749 0.056858
Cylinders 0.073650 -0.072956 ... 0.017867 0.002207
Valves_Per_Cylinder 0.071125 -0.071956 ... -0.008751 -0.017025
Fuel_Tank_Capacity 0.032150 0.081234 ... -0.042896 -0.042815
Fuel_Type 1.000000 -0.445415 ... -0.087866 -0.101351
Height -0.445415 1.000000 ... 0.066520 0.070962
Mileage 0.093195 -0.050732 ... -0.050324 -0.047328
Length -0.262612 0.227007 ... 0.150035 0.147336
Width -0.266009 0.254775 ... 0.076259 0.075177
Body_Type -0.345959 0.136707 ... 0.080385 0.061599
Doors -0.112946 0.333427 ... 0.074348 0.086086
Kerb_Weight -0.041625 -0.091660 ... -0.082105 -0.077994
Gears 0.010405 -0.040043 ... 0.074695 0.068605
Power -0.107643 0.111514 ... 0.103753 0.096023
Speedometer -0.059259 0.196334 ... -0.017591 -0.042168
Seating_Capacity -0.465110 0.785823 ... 0.094159 0.104625
Seats_Material -0.060493 0.004517 ... 0.000520 -0.004417
Type -0.096328 0.148243 ... -0.058357 -0.063302
Wheelbase -0.183828 0.193973 ... 0.121981 0.122759
Wheels_Size -0.163078 0.320688 ... 0.079969 0.079446
Audiosystem 0.049507 -0.048633 ... 0.016289 0.020194
Basic_Warranty 0.060417 -0.051222 ... 0.115314 0.083417
Bluetooth 0.006477 0.124584 ... -0.069171 -0.078663
CD_/_MP3_/_DVD_Player 0.099779 0.104813 ... -0.114508 -0.122052
Voice_Recognition -0.036740 0.108912 ... -0.001062 0.000229
Walk_Away_Auto_Car_Lock -0.094539 0.075527 ... 0.032562 0.037988
ABS_(Anti-lock_Braking_System) -0.027755 0.187992 ... -0.122439 -0.122888
Airbags 0.017158 0.101550 ... -0.130311 -0.131385
Parking_Assistance -0.033688 -0.058274 ... 0.083728 0.087305
2020 -0.113364 0.082212 ... 0.717282 0.704241
2019 -0.117445 0.072288 ... 0.773154 0.759953
2018 -0.109487 0.068365 ... 0.818546 0.803232
2017 -0.097590 0.057430 ... 0.854174 0.836179
2016 -0.084362 0.061403 ... 0.895144 0.878750
2015 -0.082972 0.067882 ... 0.923819 0.910711
2014 -0.084154 0.070001 ... 0.935694 0.923896
2013 -0.082491 0.073799 ... 0.948150 0.937108
2012 -0.075320 0.070946 ... 0.968559 0.957965
2011 -0.065654 0.054414 ... 0.983298 0.964497
2010 -0.087866 0.066520 ... 1.000000 0.982726
2009 -0.101351 0.070962 ... 0.982726 1.000000
2008 -0.101182 0.076766 ... 0.960176 0.968303
2007 -0.104886 0.060226 ... 0.937428 0.946010
2006 -0.098825 0.043237 ... 0.912960 0.929158
2005 -0.104021 0.054503 ... 0.889007 0.903935
2004 -0.122677 0.055633 ... 0.862551 0.884884
2003 -0.113994 0.053367 ... 0.844577 0.865202
2002 -0.101578 0.055531 ... 0.812641 0.827817
2001 -0.102704 0.092178 ... 0.759007 0.768025
2008 2007 2006 2005 \
Make 0.007526 0.002874 -0.028341 -0.022707
Model -0.060613 -0.053609 -0.040278 -0.047797
Variant -0.058288 -0.059169 -0.045639 -0.040838
Ex-Showroom_Price -0.024890 -0.006691 0.004146 0.022745
Displacement 0.038771 0.039776 0.044899 0.058172
Cylinders 0.002837 0.012457 0.021810 0.032112
Valves_Per_Cylinder -0.049046 -0.061956 -0.047748 -0.029689
Fuel_Tank_Capacity -0.049399 -0.046491 -0.040973 -0.037917
Fuel_Type -0.101182 -0.104886 -0.098825 -0.104021
Height 0.076766 0.060226 0.043237 0.054503
Mileage -0.048655 -0.051815 -0.051611 -0.054745
Length 0.128840 0.118789 0.119867 0.132529
Width 0.050964 0.037554 0.048032 0.061676
Body_Type 0.068644 0.070442 0.072382 0.089699
Doors 0.109051 0.093499 0.081519 0.067409
Kerb_Weight -0.082351 -0.087290 -0.087963 -0.090490
Gears 0.056241 0.051726 0.052661 0.043054
Power 0.081340 0.079487 0.089642 0.097329
Speedometer -0.037170 -0.038611 -0.037986 -0.023125
Seating_Capacity 0.111113 0.097771 0.076567 0.087438
Seats_Material -0.011451 -0.007482 0.003252 0.011753
Type -0.029787 -0.034858 -0.042300 -0.053888
Wheelbase 0.095477 0.086495 0.088693 0.099371
Wheels_Size 0.068786 0.065957 0.075106 0.075682
Audiosystem 0.016307 0.025672 0.032336 0.040048
Basic_Warranty 0.095063 0.080919 0.074584 0.076788
Bluetooth -0.069034 -0.065213 -0.077171 -0.082368
CD_/_MP3_/_DVD_Player -0.119326 -0.122458 -0.137274 -0.136490
Voice_Recognition 0.009189 0.017027 0.008508 -0.002410
Walk_Away_Auto_Car_Lock 0.046752 0.047410 0.038037 0.014990
ABS_(Anti-lock_Braking_System) -0.119655 -0.100853 -0.104947 -0.106098
Airbags -0.144713 -0.130748 -0.131563 -0.133040
Parking_Assistance 0.082925 0.086838 0.106175 0.122062
2020 0.683737 0.671682 0.672234 0.679483
2019 0.726329 0.711250 0.709864 0.712452
2018 0.769192 0.754423 0.748913 0.747245
2017 0.808206 0.792126 0.787450 0.784457
2016 0.851004 0.826012 0.813117 0.803531
2015 0.875979 0.846551 0.833550 0.819972
2014 0.891944 0.863522 0.853644 0.834372
2013 0.909792 0.880845 0.864802 0.847890
2012 0.929203 0.900370 0.881110 0.865411
2011 0.936751 0.910986 0.892245 0.874111
2010 0.960176 0.937428 0.912960 0.889007
2009 0.968303 0.946010 0.929158 0.903935
2008 1.000000 0.986006 0.958722 0.932426
2007 0.986006 1.000000 0.979841 0.953585
2006 0.958722 0.979841 1.000000 0.983086
2005 0.932426 0.953585 0.983086 1.000000
2004 0.906593 0.932364 0.965650 0.986727
2003 0.891694 0.918514 0.947832 0.970084
2002 0.867365 0.898841 0.922937 0.948436
2001 0.820410 0.851258 0.867647 0.895052
2004 2003 2002 2001
Make -0.023089 0.005454 0.033423 0.080530
Model -0.053334 -0.064037 -0.056388 -0.058918
Variant -0.033640 -0.025720 -0.044466 -0.051113
Ex-Showroom_Price 0.017314 0.020139 0.027058 0.043522
Displacement 0.048318 0.043726 0.046905 0.068427
Cylinders 0.027538 0.020634 0.023154 0.031733
Valves_Per_Cylinder -0.046000 -0.047788 -0.056333 -0.062636
Fuel_Tank_Capacity -0.043629 -0.045429 -0.043184 -0.039925
Fuel_Type -0.122677 -0.113994 -0.101578 -0.102704
Height 0.055633 0.053367 0.055531 0.092178
Mileage -0.055985 -0.054225 -0.054860 -0.058926
Length 0.131912 0.129719 0.139481 0.170235
Width 0.060364 0.055977 0.054802 0.072160
Body_Type 0.098220 0.108213 0.130932 0.137814
Doors 0.059932 0.056989 0.041730 0.047782
Kerb_Weight -0.091942 -0.087428 -0.086561 -0.089753
Gears 0.040194 0.039300 0.037772 0.042717
Power 0.091737 0.090601 0.096138 0.117713
Speedometer -0.023490 -0.028578 -0.036334 -0.037495
Seating_Capacity 0.092796 0.096228 0.103968 0.148299
Seats_Material 0.014134 0.008102 0.010123 0.020130
Type -0.050458 -0.046679 -0.039764 -0.023933
Wheelbase 0.103084 0.097378 0.104264 0.132096
Wheels_Size 0.068334 0.059143 0.063509 0.082791
Audiosystem 0.056643 0.058010 0.043599 0.035159
Basic_Warranty 0.070389 0.072765 0.075454 0.094425
Bluetooth -0.085805 -0.096520 -0.085405 -0.056615
CD_/_MP3_/_DVD_Player -0.139706 -0.147393 -0.146553 -0.129998
Voice_Recognition -0.014284 -0.010566 0.010103 0.019162
Walk_Away_Auto_Car_Lock 0.007052 0.020102 0.032556 0.039537
ABS_(Anti-lock_Braking_System) -0.089296 -0.094677 -0.093962 -0.082036
Airbags -0.121141 -0.124823 -0.123440 -0.124149
Parking_Assistance 0.130746 0.132320 0.125227 0.117256
2020 0.665006 0.665127 0.645353 0.607763
2019 0.696147 0.688050 0.658355 0.610747
2018 0.731552 0.725606 0.694393 0.644986
2017 0.763901 0.758469 0.729586 0.681069
2016 0.782038 0.771788 0.737788 0.689043
2015 0.801705 0.789842 0.753937 0.703589
2014 0.815235 0.801889 0.765768 0.713647
2013 0.824896 0.810344 0.775736 0.724514
2012 0.840915 0.827944 0.795463 0.743372
2011 0.849997 0.836756 0.805357 0.750136
2010 0.862551 0.844577 0.812641 0.759007
2009 0.884884 0.865202 0.827817 0.768025
2008 0.906593 0.891694 0.867365 0.820410
2007 0.932364 0.918514 0.898841 0.851258
2006 0.965650 0.947832 0.922937 0.867647
2005 0.986727 0.970084 0.948436 0.895052
2004 1.000000 0.985468 0.956465 0.899610
2003 0.985468 1.000000 0.984176 0.938277
2002 0.956465 0.984176 1.000000 0.973790
2001 0.899610 0.938277 0.973790 1.000000
[53 rows x 53 columns]
<AxesSubplot:>
# Scatter plot of Height (x-axis) against Displacement (y-axis).
plt.scatter(x=df["Height"], y=df["Displacement"])
<matplotlib.collections.PathCollection at 0x259811a1730>
# Box plot of the Height column.
sns.boxplot(x=df["Height"])
<AxesSubplot:xlabel='Height'>
# Write the cleaned frame to cars.csv in the working directory; the row
# index is written as well (pandas default).
df.to_csv(path_or_buf="cars.csv")
# Reload both tables and shuffle each one (frac=1 samples every row, in
# random order; no seed is set, so the order differs between runs).
# pd.concat matches rows by index label, so each specification row is still
# paired with the sales row carrying the same label.
specifications = pd.read_csv("./datasets/specifications.csv")
specifications = specifications.sample(frac=1)
annual_sales = pd.read_csv("./datasets/annual_sales.csv")
annual_sales = annual_sales.sample(frac=1)
df_new = pd.concat(
    objs=[specifications, annual_sales],
    join="inner",
    axis="columns",
)
Now repeat all of the cleaning steps above on this new dataframe.